# Dylan Carlson Sirvent León (dylancarlson@g.harvard.edu)
# Code defines function read_eia. It is necessary to read scraped EIA
# xlsx files. Due to irregular formatting, readxl functions struggle to read these files.

library(readxl)
library(tidyverse)
library(janitor)
library(glue)
library(here)

read_eia <- function(year, month_no, row_to_name = 2) {
    
  month_names <- c("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december")
  month_name <- month_names[as.numeric(month_no)]
  
  file_path <- here("data", "input", "EIA-860M", glue("{year}_{month_no}_{month_name}.xlsx"))
  
  sheet_names <- excel_sheets(file_path)
  
  xlsx_list <- sheet_names |>
    lapply(function(sheet) {
      
      raw_data <- suppressMessages(suppressWarnings(read_xlsx(file_path, sheet = sheet, col_names = FALSE, progress = FALSE)))
      
      potential_headers <- raw_data[2, ]
      if (any(is.na(potential_headers)) || length(unique(potential_headers)) != length(potential_headers)) {
        row_to_name <- 3
      }
      
      cleaned_data <- raw_data |>
        row_to_names(row_number = row_to_name) |>
        set_names(~ str_replace_all(., "[\r\n_]+", "")) |>
        set_names(~ str_replace_all(., " ", "_")) |>      
        set_names(~ str_replace(., "^_", ""))             
      
      return(cleaned_data)
    })
  
  names(xlsx_list) <- sheet_names
  
  return(xlsx_list)
}